########################################
## binary_simulations.R
## File to run the binary simulations from
## Blackwell & Olson (2021)
########################################

library(glmnet)
library(inters)
library(sandwich)
library(hdm)

args <- commandArgs(TRUE)
if (is.null(args)) args <- c(0, 750, 0, 0, 100, 20, "binary")
mc.cores <- 1
source("getcoef_sim.R")
run <- as.integer(args[1])
n_units <- as.integer(args[2])
R22  <- as.numeric(args[3])
R21  <- as.numeric(args[4])
n_covs  <- as.integer(args[5])
sims <- as.integer(args[6])
type <- args[7]

## original simulations run on R 3.5.1. After 3.6.0, R changed the RNG
## for the sample function, which leads to non-reproducible results
## for the simulations.
if (getRversion() >= "3.6.0") RNGkind(sample.kind = "Rounding")
set.seed(12345 + run)

if (!dir.exists("output")) dir.create("output")
out_file <- paste0(
  "output/", type, "_sim_", n_units, "_",
  R22, "_", R21, "_", n_covs, ".csv"
)


coef_oracle <- matrix(NA, nrow = sims, ncol = 2)
coef_lasso <- matrix(NA, nrow = sims, ncol = 2)
coef_pds <- matrix(NA, nrow = sims, ncol = 2)
coef_ols <- matrix(NA, nrow = sims, ncol = 2)
coef_sing <- matrix(NA, nrow = sims, ncol = 2)
ses_oracle <- matrix(NA, nrow = sims, ncol = 2)
ses_lasso <- matrix(NA, nrow = sims, ncol = 2)
ses_pds <- matrix(NA, nrow = sims, ncol = 2)
ses_ols <- matrix(NA, nrow = sims, ncol = 2)
ses_sing <- matrix(NA, nrow = sims, ncol = 2)

# Covariates X are drawn from a multivariate normal distribution
# Covariates X are drawn from a multivariate normal distribution
sigma <- toeplitz(0.5 ^ (0:(n_covs - 1)))

betas <- get_coefs(type = type, n_covs,
                   R2_d = R21, R2_y = R22)
beta_y  <- betas$beta_y
beta_v  <- betas$beta_v
beta_d  <- betas$beta_d
beta_vy <- betas$beta_vy
beta_dy <- betas$beta_dy
beta_vd <- betas$beta_vd

# set up treatment effect and moderator effect
treatment_effect <- 0.1
moderator_effect <- 0.25
alpha0 <- 0.25

sim_iter <- function(s, ...) {
  if ( (s / sims * 100) %% 10 == 0) {
    cat("Iteration: ", s, "/", sims, "\n")
  }

  out <- data.frame(n_units = n_units, n_covs = n_covs, dgp = type,
                    R2_d = R21, R2_y = R22)

  X <- MASS::mvrnorm(n = n_units, mu = rep(0, n_covs), Sigma = sigma)
  X <- scale(X, scale = FALSE)
  c_names <- paste("X", 1:n_covs, sep = "")
  colnames(X) <- c_names
  # now generate V (moderator) and D (treatment) using X and relevant betas
  V <- rbinom(n_units, 1, boot::inv.logit(X %*% beta_v))
  XV <- X * V
  colnames(XV) <- paste("XV", 1:n_covs, sep = "")
  d <- X %*% beta_d + V * 0.25 + XV %*% beta_vd + rnorm(n_units)
  d_X <- c(d) * X
  y_lp <- X %*% beta_y + d * treatment_effect + V  * moderator_effect +
    alpha0 * d * V + (XV) %*% beta_vy + d_X %*% beta_dy
  y <- rbinom(n_units, 1, boot::inv.logit(y_lp))


  sim_data <- data.frame(y = y, d = c(d), d_V = c(d) * V, V = V, X, XV)
  big_x <- cbind(d = c(d), d_V = c(d) * V, V = V, X, XV)
  
  ## oracle
  oracle_out <- glm(y ~ d * V + X[,1:15] + XV[,1:15],
                    family = binomial())
  oracle_vcv <- sandwich::vcovHC(oracle_out)
  out$oracle_main <- oracle_out$coefficients["d"]
  out$oracle_int <- oracle_out$coefficients["d:V"]
  out$oracle_mainse <- sqrt(oracle_vcv["d", "d"])
  out$oracle_intse <- sqrt(oracle_vcv["d:V", "d:V"])
  
  ## post-double selection
  I3 <- c(FALSE, FALSE, rep(FALSE, ncol(X) + 1), rep(FALSE, ncol(XV)))
  pds_out <- hdm::rlassologitEffects(x = big_x, y = y, index = c(1, 2),
                                       I3 = NULL)
  out$pds_main <- pds_out$coefficients["d"]
  out$pds_int <- pds_out$coefficients["d_V"]
  out$pds_mainse <- pds_out$se["d"]
  out$pds_intse <- pds_out$se["d_V"]

  ## standard lasso
  lasso_out <- hdm::rlassologit(x = big_x, y = y, post = TRUE)
  selected <- c(TRUE, lasso_out$index)
  selected[2:(n_covs + 4)] <- TRUE
  seldata <- sim_data[, which(selected)]
  post_lasso <- glm(y ~ ., data = seldata)
  coef_lasso[s, ] <- post_lasso$coefficients[c("d", "d_V")]
  ses_lasso <- sqrt(diag(sandwich::vcovHC(post_lasso)))[c("d", "d_V")]
  out$lasso_main <- lasso_out$coefficients["d"]
  out$lasso_int <- lasso_out$coefficients["d_V"]
  out$lasso_mainse <- ses_lasso["d"]
  out$lasso_intse <- ses_lasso["d_V"]

  ## fully moderated
  out$fully_main <- NA
  out$fully_int <- NA
  out$fully_mainse <- NA
  out$fully_intse <- NA

  if (n_covs < 50) {
    fully <- glm(y ~ ., data = sim_data, family = "binomial")
    fully_vcv <- sandwich::vcovHC(fully)    
    if (fully$converged) {
      out$fully_main <- fully$coefficients["d"]
      out$fully_int <- fully$coefficients["d_V"]
      out$fully_mainse <- sqrt(fully_vcv["d", "d"])
      out$fully_intse <- sqrt(fully_vcv["d_V", "d_V"])
    }
  }

  out$single_main <- NA
  out$single_int <- NA
  out$single_mainse <- NA
  out$single_intse <- NA

  ## single interaction
  single <- glm(y ~ d + d_V + V + X, data = sim_data, family = "binomial")
  if (single$converged) {
    single_vcv <- sandwich::vcovHC(single)
    out$single_main <- single$coefficients["d"]
    out$single_int <- single$coefficients["d_V"]
    out$single_mainse <- sqrt(single_vcv["d", "d"])
    out$single_intse <- sqrt(single_vcv["d_V", "d_V"])
  }
  out
}
  
sim_out <- parallel::mclapply(1:sims, sim_iter, mc.cores = mc.cores)
out <- do.call(rbind, sim_out)
write.csv(out, file = out_file, row.names = FALSE)
